Generate projections of DOC.

Apply the model to new environmental data to get a world map of POC projections.

Author

Thelma Panaïotis

Set-up and load data

#|output: false
#|cache: false
source("utils.R")
load("data/03.doc_from_env_nested_cv.Rdata")

Extract projections

# Unnest new predictions (i.e. projections)
new_preds <- res %>% 
  select(fold, cv_type, new_preds) %>% 
  unnest(new_preds) %>% 
  # Apply exp to predictions as we predicted log(doc)
  mutate(pred_doc = exp(pred_doc_log), .after = pred_doc_log) %>% 
  select(cv_type, fold, lon, lat, contains("doc"))

# Get those from stratified CV
new_preds_strat <- new_preds %>% filter(cv_type == "stratified")

# And those from spatial CV
new_preds_spat <- new_preds %>% filter(cv_type == "spatial")

Join projections with R² value for each fold and each cv_type.

# Unnest predictions on test set
preds <- res %>% select(fold, cv_type, preds) %>% unnest(preds)

# Compute Rsquare for each fold of each CV type
rsquares <- preds %>%
  group_by(cv_type, fold) %>%
  rsq(truth = doc_log, estimate = .pred) %>% 
  select(cv_type, fold, rsq = .estimate)

# Join with projections
new_preds_strat <- new_preds_strat %>% left_join(rsquares, by = join_by(cv_type, fold))
new_preds_spat <- new_preds_spat %>% left_join(rsquares, by = join_by(cv_type, fold))

Generate maps

For both types of CV, we use all the folds to generate and average prediction, weighted by R². Similarly, we also compute a weighted standard deviation.

Average by pixel

# Stratified
strat_proj <- new_preds_strat %>% 
  group_by(lon, lat) %>% 
  summarise(
    doc_avg = wtd.mean(pred_doc, weights = rsq, na.rm = TRUE), 
    doc_sd = sqrt(wtd.var(pred_doc, weights = rsq, na.rm = TRUE)), 
    .groups = "drop"
    )

# Spatial
spat_proj <- new_preds_spat %>% 
  group_by(lon, lat) %>% 
  summarise(
    doc_avg = wtd.mean(pred_doc, weights = rsq, na.rm = TRUE), 
    doc_sd = sqrt(wtd.var(pred_doc, weights = rsq, na.rm = TRUE)), 
    .groups = "drop"
    )

# Generate common colour bar limits for both CV types
doc_avg_lims <- c(
  min(c(strat_proj$doc_avg, spat_proj$doc_avg)), 
  max(c(strat_proj$doc_avg, spat_proj$doc_avg))  
)

doc_sd_lims <- c(
  min(c(strat_proj$doc_sd, spat_proj$doc_sd)), 
  max(c(strat_proj$doc_sd, spat_proj$doc_sd))  
)

Let’s now plot maps with a common colour scale for both CV types.

Stratified CV

ggplot(strat_proj) + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = doc_avg)) + 
  ggplot2::scale_fill_viridis_c(option = "F", limits = doc_avg_lims) +
  labs(title = "DOC avg from stratified CV") +
  coord_quickmap(expand = 0)

ggplot(strat_proj) + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = doc_sd)) + 
  ggplot2::scale_fill_viridis_c(trans = "log1p", option = "E", limits = doc_sd_lims) +
  labs(title = "DOC sd from stratified CV") +
  coord_quickmap(expand = 0)

Btw, how do projections vary across CV folds? Let’s plot it.

new_preds_strat %>% 
  # Add information of R² to cv name
  mutate(fold = paste0(fold, " (R² = ", percent(rsq, accuracy = 0.1), ")")) %>% 
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = pred_doc)) + 
  ggplot2::scale_fill_viridis_c(option = "F") +
  labs(title = "DOC pred across stratified CV folds") +
  coord_quickmap(expand = 0) +
  facet_wrap(~fold)

Note

Folds are relatively similar, not surprising given stratified CV was constrained on deciles of DOC values.

Spatial CV

ggplot(spat_proj) + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = doc_avg)) + 
  ggplot2::scale_fill_viridis_c(option = "F", limits = doc_avg_lims) +
  labs(title = "DOC avg from spatial CV") +
  coord_quickmap(expand = 0)

ggplot(spat_proj) + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = doc_sd)) + 
  ggplot2::scale_fill_viridis_c(trans = "log1p", option = "E", limits = doc_sd_lims) +
  labs(title = "DOC sd from spatial CV") +
  coord_quickmap(expand = 0)

Sd across folds is higher for spatial CV, we can expect more variation across folds, let’s plot it.

new_preds_spat %>% 
  mutate(fold = paste0(fold, " (R² = ", percent(rsq, accuracy = 0.1), ")")) %>% 
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = pred_doc)) + 
  ggplot2::scale_fill_viridis_c(option = "F") +
  labs(title = "DOC pred across spatial CV folds") +
  coord_quickmap(expand = 0) +
  facet_wrap(~fold)

Note

As expected, more variation can be detected across folds.

Check the difference between stratified and spatial CV

strat_proj %>% 
  select(lon, lat, strat = doc_avg) %>% 
  left_join(spat_proj %>% select(lon, lat, spat = doc_avg), by = join_by(lon, lat)) %>% 
  mutate(diff = strat - spat) %>% 
  ggplot() + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = diff)) + 
  scale_fill_gradient2(low = "#4575b4", mid = "#ffffbf", high = "#d73027") +
  labs(title = "DOC stratified CV - DOC spatial CV") +
  coord_quickmap(expand = 0)

Conclusion

No obvious difference between both (except in the North Sea), likely due to averaging across folds.